{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import json\n",
    "from lxml import etree\n",
    "import os\n",
    "# There are ways to size an image without loading it into memory by reading its headers (https://github.com/scardine/image_size), but that seems less reliable.\n",
    "from tqdm import tqdm\n",
    "from eMammal_helpers import clean_species_name, clean_frame_number, get_img_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Echo the value of every top-level expression in a cell, not only the last one.\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "\n",
    "# Reload imported modules before each cell executes, so edits to eMammal_helpers\n",
    "# are picked up without restarting the kernel.\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display Matplotlib figures inline and set plotting defaults.\n",
    "# Note: no later cell in this notebook calls plt.\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%matplotlib inline\n",
    "plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots\n",
    "plt.rcParams['image.interpolation'] = 'nearest'  # no smoothing when images are drawn\n",
    "plt.rcParams['image.cmap'] = 'gray'  # default colormap for images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# configurations and paths\n",
    "# NOTE: these are absolute, machine-specific paths; adjust them before running elsewhere.\n",
    "output_dir_path = '/home/yasiyu/scripts/output'  # not referenced by any other cell in this notebook\n",
    "\n",
    "csv_path = '/home/yasiyu/scripts/input/emammal_2018.08.20.csv'  # csv specifying the images sent for annotation\n",
    "\n",
    "# root folder holding one sub-folder per deployment, each with a deployment_manifest.xml\n",
    "deployments_path = '/datadrive/emammal'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Produce COCO format json database for eMammal \n",
    "\n",
    "Contact: Siyu Yang <cameratraps@lila.science>\n",
    "\n",
    "\n",
    "**The content of this notebook has been refactored into `create_eMammal_json.py`. Please run, and make modifications on, that script instead.**\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "Run this notebook with Python 3.\n",
    "\n",
    "This notebook produces the COCO formatted json database, which contains all the images whether they were annotated with bounding boxes or not. In this process, each image needs to be loaded to size it.\n",
    "\n",
    "Bbox annotations will be added to the database when they arrive.\n",
    "\n",
    "General format defined at http://cocodataset.org/#format-data, and specific to our work, see Sara's [definition](https://ai4edevelopment.visualstudio.com/AI%20for%20Earth%20Development/AI%20for%20Earth%20Development%20Team/_git/cameraTraps?path=%2Fdatabase_tools%2FREADME.md&version=GBmaster).\n",
    "\n",
    "I decided to save the sequence-level ResearcherIdentifications info in each image object. Where a sequence has multiple different species, they are semicolon-separated.\n",
    "\n",
    "```\n",
    "image{\n",
    "  \"id\" : str,\n",
    "  \"width\" : int,\n",
    "  \"height\" : int,\n",
    "  \"file_name\" : str,\n",
    "  \"rights_holder\" : str,  # not included\n",
    "  \"location\": str,  # not int\n",
    "  \"datetime\": datetime,\n",
    "  \"seq_id\": str,\n",
    "  \"seq_num_frames\": int,\n",
    "  \"frame_num\": int,\n",
    "  \"label\": str  # just for eMammal\n",
    "}\n",
    "```\n",
    "\n",
    "Image ID is in this format:\n",
    "\n",
    "`datasetemammal.projectp100.deploymentd17432.seqd17432s11.frame004.imgd17432s11i4`\n",
    "\n",
    "Image file name:\n",
    "\n",
    "`emammal/3191d36836/d36836s14i1.JPG` or `.jpg`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'info' section of the COCO-format database\n",
    "db_info = dict(\n",
    "    year=2018,\n",
    "    version='0.0.1',\n",
    "    description='eMammal dataset containing 3140 deployments, in COCO format.',\n",
    "    contributor='eMammal',\n",
    "    date_created=datetime.date.today().isoformat()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sequential"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one COCO-style image entry per <Image> element found in each deployment manifest.\n",
    "# get_img_size is called inline, so images are sized one at a time as the loop runs.\n",
    "db_images = []\n",
    "for deployment in tqdm(os.listdir(deployments_path)):\n",
    "    deployment_path = os.path.join(deployments_path, deployment)\n",
    "    manifest_path = os.path.join(deployment_path, 'deployment_manifest.xml')\n",
    "\n",
    "    # hand lxml the path so it reads the raw bytes and applies the encoding declared in the XML\n",
    "    tree = etree.parse(manifest_path)\n",
    "\n",
    "    root = tree.getroot()\n",
    "    project_id = root.findtext('ProjectId')\n",
    "    deployment_id = root.findtext('CameraDeploymentID')\n",
    "    deployment_location = root.findtext('CameraSiteName')\n",
    "\n",
    "    for sequence in root.findall('ImageSequence'):\n",
    "        seq_id = sequence.findtext('ImageSequenceId')\n",
    "\n",
    "        # collect the distinct species named in this sequence's researcher identifications\n",
    "        species = set()\n",
    "        for researcher_id in sequence.findall('ResearcherIdentifications'):\n",
    "            for identification in researcher_id.findall('Identification'):\n",
    "                species.add(clean_species_name(identification.findtext('SpeciesCommonName')))\n",
    "\n",
    "        # sorted so the label does not depend on set iteration order\n",
    "        species_str = ';'.join(sorted(species))\n",
    "\n",
    "        # add each image's info to database\n",
    "        images = sequence.findall('Image')\n",
    "        seq_num_frames = len(images)  # number of images listed for this sequence\n",
    "        for img in images:\n",
    "            img_id = img.findtext('ImageId')\n",
    "            img_file_name = img.findtext('ImageFileName')\n",
    "            assert img_file_name.lower().endswith('.jpg')  # some are .JPG and some are .jpg\n",
    "\n",
    "            img_datetime = img.findtext('ImageDateTime')  # these are in different formats...\n",
    "            img_frame = clean_frame_number(img.findtext('ImageOrder'))\n",
    "\n",
    "            full_img_id = 'datasetemammal.project{}.deployment{}.seq{}.frame{}.img{}'.format(project_id, deployment_id, seq_id, img_frame, img_id)\n",
    "\n",
    "            full_img_path = os.path.join(deployment_path, img_file_name)\n",
    "            img_width, img_height = get_img_size(full_img_path)\n",
    "\n",
    "            db_images.append({\n",
    "                'id': full_img_id,\n",
    "                'width': img_width,\n",
    "                'height': img_height,\n",
    "                'file_name': os.path.join('emammal', deployment, img_file_name),\n",
    "                'location': deployment_location,\n",
    "                'datetime': img_datetime,\n",
    "                'seq_id': seq_id,\n",
    "                'seq_num_frames': seq_num_frames,\n",
    "                'frame_num': int(img_frame),  # position of this image within its sequence\n",
    "                'label': species_str\n",
    "            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# preview only: echoing the whole list would flood the notebook output\n",
    "len(db_images)\n",
    "db_images[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Threaded for faster IO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import multiprocessing\n",
    "# multiprocessing.dummy exposes the Pool API backed by threads instead of processes\n",
    "from multiprocessing.dummy import Pool as ThreadPool  # this functions like threading\n",
    "\n",
    "import warnings\n",
    "# NOTE(review): this hides every warning for the rest of the kernel session - confirm a blanket filter is intended\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _add_image(entry, full_img_path):\n",
    "    \"\"\"Size one image and record the result on its database entry.\n",
    "\n",
    "    Passes full_img_path to get_img_size, stores the returned pair under the\n",
    "    'width' and 'height' keys of entry (modifying it in place), advances the\n",
    "    module-level progress bar pbar by one, and returns entry.\n",
    "    \"\"\"\n",
    "    width, height = get_img_size(full_img_path)\n",
    "    entry.update(width=width, height=height)\n",
    "    pbar.update(1)\n",
    "    return entry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one (entry, image path) task per <Image> element in each deployment manifest.\n",
    "# Width and height are left as placeholders here and filled in later by _add_image.\n",
    "tasks = []\n",
    "\n",
    "print('Looping through the deployments...')\n",
    "for deployment in tqdm(os.listdir(deployments_path)):\n",
    "    deployment_path = os.path.join(deployments_path, deployment)\n",
    "    manifest_path = os.path.join(deployment_path, 'deployment_manifest.xml')\n",
    "\n",
    "    # hand lxml the path so it reads the raw bytes and applies the encoding declared in the XML\n",
    "    tree = etree.parse(manifest_path)\n",
    "\n",
    "    root = tree.getroot()\n",
    "    project_id = root.findtext('ProjectId')\n",
    "    deployment_id = root.findtext('CameraDeploymentID')\n",
    "    deployment_location = root.findtext('CameraSiteName')\n",
    "\n",
    "    for sequence in root.findall('ImageSequence'):\n",
    "        seq_id = sequence.findtext('ImageSequenceId')\n",
    "\n",
    "        # collect the distinct species named in this sequence's researcher identifications\n",
    "        species = set()\n",
    "        for researcher_id in sequence.findall('ResearcherIdentifications'):\n",
    "            for identification in researcher_id.findall('Identification'):\n",
    "                species.add(clean_species_name(identification.findtext('SpeciesCommonName')))\n",
    "\n",
    "        # sorted so the label does not depend on set iteration order\n",
    "        species_str = ';'.join(sorted(species))\n",
    "\n",
    "        # add each image's info to database\n",
    "        images = sequence.findall('Image')\n",
    "        seq_num_frames = len(images)  # number of images listed for this sequence\n",
    "        for img in images:\n",
    "            img_id = img.findtext('ImageId')\n",
    "            img_file_name = img.findtext('ImageFileName')\n",
    "\n",
    "            assert img_file_name.lower().endswith('.jpg')  # some are .JPG and some are .jpg\n",
    "\n",
    "            img_datetime = img.findtext('ImageDateTime')  # these are in different formats...\n",
    "            img_frame = clean_frame_number(img.findtext('ImageOrder'))\n",
    "\n",
    "            full_img_id = 'datasetemammal.project{}.deployment{}.seq{}.frame{}.img{}'.format(project_id, deployment_id, seq_id, img_frame, img_id)\n",
    "\n",
    "            full_img_path = os.path.join(deployment_path, img_file_name)\n",
    "\n",
    "            entry = {\n",
    "                'id': full_img_id,\n",
    "                'width': 0,  # place holders\n",
    "                'height': 0,\n",
    "                'file_name': os.path.join('emammal', deployment, img_file_name),\n",
    "                'location': deployment_location,\n",
    "                'datetime': img_datetime,\n",
    "                'seq_id': seq_id,\n",
    "                'seq_num_frames': seq_num_frames,\n",
    "                'frame_num': int(img_frame),  # position of this image within its sequence\n",
    "                'label': species_str\n",
    "            }\n",
    "\n",
    "            tasks.append((entry, full_img_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size every queued image on a pool of threads and collect the completed entries.\n",
    "print('Finished creating tasks.')\n",
    "num_workers = multiprocessing.cpu_count()\n",
    "pool = ThreadPool(num_workers)\n",
    "pbar = tqdm(total=len(tasks))  # module-level on purpose: _add_image updates it\n",
    "\n",
    "try:\n",
    "    # starmap blocks until all tasks are done and returns results in task order\n",
    "    db_images = pool.starmap(_add_image, tasks)\n",
    "finally:\n",
    "    # release the worker threads and the progress bar even if a task raised\n",
    "    print('Waiting for processes to finish...')\n",
    "    pool.close()\n",
    "    pool.join()\n",
    "    pbar.close()\n",
    "print('Done.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of (entry, image path) tasks queued for sizing\n",
    "len(tasks)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some images could not be opened by PIL. Inspecting them locally shows that they are corrupt image files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Presumably one of the corrupt files mentioned above; catch the failure so the\n",
    "# notebook still runs top to bottom and the error message is shown as output.\n",
    "img_path = '/datadrive/emammal/p168d31859/d31859s15i2.JPG'\n",
    "try:\n",
    "    im = Image.open(img_path)\n",
    "except OSError as e:  # PIL raises OSError (or a subclass) for unreadable files\n",
    "    print('Could not open {}: {}'.format(img_path, e))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:py35]",
   "language": "python",
   "name": "conda-env-py35-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
